In this section, I walk through the data and demonstrate how the models fit it.
I saved the pre-prepared AIF data in the Rawdata folder. This dataset consists of 10 patients.
# Load data ----
# Each CSV in ../Rawdata holds one patient's pre-prepared AIF measurements.
# NOTE: list.files(pattern = ) expects a regular expression, not a shell glob;
# "*.csv" is not valid as a glob here, so anchor on the ".csv" extension.
filenames = list.files(path = "../Rawdata", pattern = "\\.csv$") # names of files
data = tibble(
  patient = substr(filenames, 1, 5), # first 5 characters of the file name identify the patient
  # read each file and drop the row-index column X written by write.csv
  count = map(filenames, ~read.csv(paste0("../Rawdata/", .)) %>% select(-X))
)
# Find tmax and slice each patient's curve into ascending/descending parts.
# findtmax/slice_asc/slice_exp are project helpers defined elsewhere; per the
# inline notes below, they presumably mark the time of peak AIF (tmax), keep
# the pre-peak observations, and keep the post-peak observations with time
# re-zeroed at tmax — confirm against their definitions.
data = data %>%
group_by(patient) %>%
mutate(count = map(count,~findtmax(.x)), # the tmax is the one with max aif
count_asc = map(count, ~slice_asc(.x)), # data before tmax
count_dsc = map(count, ~slice_exp(.x))) # data after tmax; time=time-tmax and t_G=t_G-tmax
Take the first subject, patient jdcs1, as an example.
# Extract the first patient's data frame from the list-column ([[1]] pulls the
# element itself rather than a length-1 list) and plot AIF against time.
subject_1 = data$count[[1]] %>% as.data.frame()
ggplot(subject_1, aes(x = time, y = disp_aif)) +
  geom_point(shape = "diamond", size = 2) +
  labs(x = "Time", y = "AIF",
       title = "AIF over time for patient jdcs1") +
  # theme_light() must come BEFORE theme(): complete themes reset every theme
  # element, so applying it last would wipe the axis-title adjustments below.
  theme_light() +
  theme(axis.title.x = element_text(vjust = 0, size = 15),
        axis.title.y = element_text(vjust = 2, size = 15))
Over time, the AIF rises sharply and then decays slowly. We call the time at which the AIF reaches its peak \(tmax\). For the AIF before the peak, we used simple linear regression. We were mainly interested in the AIF after the peak, which we modelled with both parametric and non-parametric methods.
# Interactive table of patient jdcs1's data (datatable presumably from the DT package)
datatable(subject_1)
The way we calculate AIF: \(disp\_aif = \frac{count\times e^{\frac{\log(2)\times time}{20}}\times 0.037\times (0.07756609407608\times e^{0.080728586340915\times vol})\times parentFraction}{bpr\times disp\_fct\times delta\times vol}\) (the regression code uses the more precise half-life 20.364 min in the decay term).
The way we set offsets: offset =log(delta)+log(vol)+log(disp_fct)+(-log(2)/20.364*t_G)+(-log(0.003))+(-0.0807*vol)+(-log(parentFraction))+log(bpr)
name in the offsets code : explanation
# Parametric modelling of each patient's curve:
#  - ascending part (before tmax): acs_inter presumably detects the start
#    time t0 and interpolates the AIF between t0 and tmax — confirm against
#    its definition;
#  - descending part (after tmax): kinfitr_gnm fits a nonlinear Poisson
#    regression (per the inline note) from raw counts plus the measurement
#    covariates (delta, vol, parentFraction, bpr).
# Intermediate asc_res is dropped at the end once its pieces are extracted.
pare_data = data %>%
group_by(patient) %>%
mutate(asc_res = map(count_asc, ~acs_inter(.x)), # detect t0 and interpolate aif between t0 and tmax
count_asc = map(asc_res, ~.x$data), # add t0 to the data
asc_mod = map(asc_res, ~.x$segmod), # save the model that detects t0
dsc_mod = map(count_dsc, # fit nonlinear poisson regression for the descending part
~kinfitr_gnm(t = .x$time, # time since tmax
t_G = .x$t_G, # time the sample entered the gamma counter, since injection
y.sum = .x$count,
delta = .x$delta, # time in the gamma counter
vol = .x$vol,
pf = .x$parentFraction,
bpr = .x$bpr,
disp = rep(1,nrow(.x)) # dispersion factor fixed to 1 here
)
),
dsc_mod = map(dsc_mod, ~.x$result), # save the fitted model for data after tmax
asc_pred = map(asc_res, ~.x$pred), # predictions before tmax
# predictions after tmax, containing the interpolated aif
dsc_pred = map2(dsc_mod,count_dsc,~pred_aif(.x,.y))
) %>%
select(-asc_res)
# For each patient, overlay the observed data (continuous in grey, discrete
# in red) and the parametric-regression predictions (orange) on the log(AIF)
# scale. Later layers are added with points()/lines() instead of repeated
# par(new = TRUE) + plot() calls, which redrew the full frame each time.
for (i in seq_len(nrow(pare_data))){
  patient = pare_data$patient[i]
  data_line = pare_data$count_dsc[[i]] %>% as.data.frame()
  para_data = pare_data$dsc_pred[[i]]$rsd %>% as.data.frame()
  # continuous data sets up the axes; all layers share these coordinates
  con_data = data_line %>% filter(Method == "Continuous")
  plot(con_data$time, log(con_data$disp_aif), col = "grey",
       xlab = "time(min)",
       ylab = "log(AIF)",
       xlim = c(0, 90),
       ylim = c(-2, 5),
       main = paste0("Parametric Regression for patient:", patient))
  legend(50, 5, c("Continuous data"), text.col = "grey", bty = "n")
  # discrete (manually sampled) data
  dis_data = data_line %>% filter(Method == "Discrete")
  points(dis_data$time, log(dis_data$disp_aif), col = "red")
  legend(50, 4.5, c("Discrete data"), text.col = "red", bty = "n")
  # parametric-regression predictions
  lines(para_data$time_dsc, log(para_data$pred), col = "darkorange2", lwd = 2)
  legend(50, 4, c("Predicted value"), text.col = "darkorange2", bty = "n")
}
For the non-parametric poisson regression, we used \(SCAM\) function to achieve monotone decreasing smooths.
# Non-parametric Poisson regression with a monotone-decreasing smooth of time.
# scam's bs = "mpd" basis constrains the smooth to decrease; Method
# (Continuous/Discrete) enters as a fixed effect. The offset carries the known
# measurement corrections (decay with half-life 20.364 min, volume, dispersion,
# parent fraction, blood-to-plasma ratio) onto the log scale.
# NOTE: the default `data = data` lazily resolves to the global `data` object;
# callers should always pass the data explicitly.
poisson_regress = function(data = data, k_value = 15){
  model = scam(count ~ s(time, k = k_value, bs = "mpd") + Method,
               offset = log(delta)+log(vol)+log(disp_fct)+(-log(2)/20.364*t_G)+(-log(0.003))+(-0.0807*vol)
               +(-log(parentFraction))+log(bpr),
               family = poisson(link = "log"), data = data)
  model
}
Below, we show the goodness of fit of the non-parametric Poisson regression.
# For each patient: fit the monotone Poisson regression, predict the early
# (0.1-1 min) discrete curve, and overlay observed data and fitted curves on
# the log(AIF) scale.
for (i in seq_len(nrow(data))){
  patient = data$patient[i]
  data_line = data$count_dsc[[i]] %>% as.data.frame()
  plot_line = data_line %>% poisson_regress()
  # Fitted values on the log scale; subtracting the offset converts
  # log(expected count) back to log(AIF). Building a tibble directly avoids
  # the all-character coercion that cbind() of mixed types would introduce.
  test_df = tibble(
    time = as.numeric(data_line$time),
    fitted = log(plot_line$fitted.values),
    offset = as.numeric(plot_line$model$`(offset)`),
    method = data_line$Method,
    discrete = as.numeric(plot_line$coefficients[2]) # Method (Discrete) effect
  ) %>%
    mutate(aif = fitted - offset)
  # Prediction grid for the discrete curve over 0.1-1 min.
  # tibble() replaces the deprecated data_frame().
  pred_times = seq(0.1, 1, by = 0.1)
  pred_data = tibble(
    time = pred_times,
    Method = "Discrete",
    delta = 10,
    vol = 1,
    disp_fct = 1,
    t_G = time + 7, # evaluated sequentially: uses the time column above
    parentFraction = 0.8,
    bpr = 1
  )
  # predict 10 points for the discrete curve on the 0.1-1 min interval
  pred_df = tibble(time = pred_times,
                   aif = as.numeric(predict(plot_line, pred_data)))
  dis_df = test_df %>% filter(method == "Discrete") %>% select(time, aif)
  bind_dis_df = bind_rows(pred_df, dis_df) %>% arrange(time)
  con_df = test_df %>% filter(method == "Continuous")
  # base plot: continuous data fixes the axes; subsequent layers are added
  # with points()/lines() rather than par(new = TRUE) + plot()
  con_data = data_line %>% filter(Method == "Continuous")
  plot(con_data$time, log(con_data$disp_aif), col = "grey",
       xlab = "time(min)",
       ylab = "log(AIF)",
       xlim = c(0, 90),
       ylim = c(-2, 5),
       main = paste0("Poisson Regression for patient:", patient))
  legend(50, 5, c("Continuous data"), text.col = "grey", bty = "n")
  dis_data = data_line %>% filter(Method == "Discrete")
  points(dis_data$time, log(dis_data$disp_aif), col = "red")
  legend(50, 4.5, c("Discrete data"), text.col = "red", bty = "n")
  # fitted curves: solid = discrete (with early predictions), dashed = continuous
  lines(bind_dis_df$time, bind_dis_df$aif, col = "darkgreen", lwd = 2)
  lines(con_df$time, con_df$aif, lty = 2, col = "darkgreen", lwd = 2)
  legend(50, 4, c("Predicted value"), text.col = "darkgreen", bty = "n")
}
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
### Problem of over-dispersion
Although the predicted values suggest that the Poisson regression model fits well, the Q-Q plot reveals a problem of over-dispersion.
Red lines: the reference line
Grey bands: the reference bands
# Q-Q plots of the per-patient Poisson fits; departures beyond the reference
# bands indicate over-dispersion.
for (i in seq_len(nrow(data))) {
  patient = data$patient[i]
  patient_dsc = data$count_dsc[i] %>% as.data.frame()
  pois_fit = patient_dsc %>% poisson_regress()
  qq.gam(pois_fit, rep = 500, s.rep = 10, level = 1,
         pch = 19, cex = .2, main = "", rl.col = 2)
  mtext(paste0("Q-Q plot for patient:", patient),
        side = 3, line = -1, outer = TRUE)
}
To address the problem of over-dispersion, we replaced the Poisson distribution with a negative binomial distribution. Because the \(SCAM\) function does not support the negative binomial distribution, we also switched to the \(GAM\) function.
# Negative-binomial GAM for one patient's descending curve: an unconstrained
# smooth of time plus a Method fixed effect, with the same measurement-
# correction offset as the Poisson model. Used because scam() does not
# support the negative-binomial family.
# NOTE: the default `data = data` lazily resolves to the global `data` object;
# callers should always pass the data explicitly.
gam_nb_regress = function(data = data, k_value = 15){
  model = gam(count ~ s(time,k = k_value)+Method,
              offset = log(delta)+log(vol)+log(disp_fct)+(-log(2)/20.364*t_G)+(-log(0.003))+(-0.0807*vol)
              +(-log(parentFraction))+log(bpr),
              family = nb(link = "log"), method = "REML", data = data)
  model
}
# For each patient: fit the negative-binomial GAM, predict the early
# (0.1-1 min) discrete curve, and overlay observed data and fitted curves on
# the log(AIF) scale.
for (i in seq_len(nrow(data))){
  patient = data$patient[i]
  data_line = data$count_dsc[[i]] %>% as.data.frame()
  plot_line = data_line %>% gam_nb_regress()
  # Fitted values on the log scale; subtracting the offset gives log(AIF).
  # A tibble is built directly to avoid the all-character coercion cbind()
  # of mixed types would introduce.
  test_df = tibble(
    time = as.numeric(data_line$time),
    fitted = log(plot_line$fitted.values),
    offset = as.numeric(plot_line$offset),
    method = data_line$Method
  ) %>%
    mutate(aif = fitted - offset)
  # Prediction grid for the discrete curve over 0.1-1 min.
  # tibble() replaces the deprecated data_frame().
  pred_times = seq(0.1, 1, by = 0.1)
  pred_data = tibble(
    time = pred_times,
    Method = "Discrete",
    delta = 10,
    vol = 1,
    disp_fct = 1,
    t_G = time + 7, # evaluated sequentially: uses the time column above
    parentFraction = 0.8,
    bpr = 1
  )
  # predict 10 points for the discrete curve on the 0.1-1 min interval
  pred_df = tibble(time = pred_times,
                   aif = as.numeric(predict(plot_line, pred_data)))
  dis_df = test_df %>% filter(method == "Discrete") %>% select(time, aif)
  bind_dis_df = bind_rows(pred_df, dis_df) %>% arrange(time)
  con_df = test_df %>% filter(method == "Continuous")
  # base plot: continuous data fixes the axes; subsequent layers are added
  # with points()/lines() rather than par(new = TRUE) + plot()
  con_data = data_line %>% filter(Method == "Continuous")
  plot(con_data$time, log(con_data$disp_aif), col = "grey",
       xlab = "time(min)",
       ylab = "log(AIF)",
       xlim = c(0, 90),
       ylim = c(-2, 5),
       main = paste0("Regression for patient:", patient))
  legend(50, 5, c("Continuous data"), text.col = "grey", bty = "n")
  dis_data = data_line %>% filter(Method == "Discrete")
  points(dis_data$time, log(dis_data$disp_aif), col = "red")
  legend(50, 4.5, c("Discrete data"), text.col = "red", bty = "n")
  # fitted curves: solid = discrete (with early predictions), dashed = continuous
  lines(bind_dis_df$time, bind_dis_df$aif, col = "darkgreen", lwd = 2)
  lines(con_df$time, con_df$aif, lty = 2, col = "darkgreen", lwd = 2)
  legend(50, 4, c("Predicted value"), text.col = "darkgreen", bty = "n")
}
### Solved problem of over-dispersion
Red lines: the reference line
Grey bands: the reference bands
# Q-Q plots of the per-patient negative-binomial fits; points staying inside
# the reference bands indicate the over-dispersion has been handled.
for (i in seq_len(nrow(data))) {
  patient = data$patient[i]
  patient_dsc = data$count_dsc[i] %>% as.data.frame()
  nb_fit = patient_dsc %>% gam_nb_regress()
  qq.gam(nb_fit, rep = 500)
  mtext(paste0("Q-Q plot for patient:", patient),
        side = 3, line = -1, outer = TRUE)
}
# Create one long dataset holding every patient's descending-curve data.
# bind_rows over a list replaces growing a data frame with rbind() inside a
# loop (quadratic copying), and avoids the deprecated data_frame().
total_data = map2(data$patient, data$count_dsc,
                  ~cbind(patient = .x, as.data.frame(.y))) %>%
  bind_rows()
# dataset for the manually sampled (discrete) data only
discrete_data = total_data %>%
  mutate(patient = as.factor(patient)) %>%
  filter(Method == "Discrete")
# Negative-binomial GAM pooling all patients: Method-by-patient fixed effects,
# a global smooth of time, and patient-specific factor smooths (bs = "fs").
# The offset is the same measurement-correction term as in the per-patient
# models. NOTE(review): the standalone `+patient` term appears redundant with
# Method*patient (which already includes the main effect) — confirm; also the
# default `data = data` lazily resolves to the global `data` object, so always
# pass the data explicitly.
gam_total_nb_regress = function(data = data){
  model = gam(count ~ Method*patient+patient+s(time,k = 20)+s(time, patient, k=5, bs="fs"),
              offset = log(delta)+log(vol)+log(disp_fct)+(-log(2)/20.364*t_G)+(-log(0.003))+(-0.0807*vol)
              +(-log(parentFraction))+log(bpr),
              family = nb(link = "log"), data = data, method = "REML")
  model
}
# patient must be a factor for the patient-level terms in the pooled model
total_data = total_data %>% mutate(patient = as.factor(patient))
# fit the all-patient negative-binomial GAM
total_line = total_data %>% gam_total_nb_regress()
## Warning in gam.side(sm, X, tol = .Machine$double.eps^0.5): model has repeated 1-
## d smooths of same variable.
# Smooth-term plots and Q-Q diagnostic for the all-patient model.
plot(total_line, main = "Regression for all patients")
qq.gam(total_line, rep = 500)
# Fixed label: the old code reused the stale per-patient loop variable
# `patient`, mislabelling this all-patient diagnostic with the last patient.
mtext("Q-Q plot for all patients", side = 3, line = -1, outer = TRUE)
By using negative binomial regression, we dealt with the problem of over-dispersion and obtained models with a good fit. However, those models were too wiggly. We wanted to increase the smoothness by log-transforming \(time\).
First, we added back tmax for all patients’ data. Then, we log-transformed time value and fit the model. Following shows the regression model, residual plot, and how the model fitted.
# Refit each patient on log-transformed time, time = log(time + tmax), to
# reduce the wiggliness of the fits, while collecting each patient's tmax and
# number of observations for the summary table below. Accumulators are
# preallocated vectors (instead of rbind() growth, and instead of a loop
# variable literally named `nrow` that shadowed base::nrow); they are assigned
# to the original names (patients/tmax/nrow) at the end so the table code
# still finds them.
n_patients = nrow(data)
patients = character(n_patients)
tmax_values = numeric(n_patients)
n_obs = numeric(n_patients)
for (i in seq_len(n_patients)) {
  patient = data$patient[i]
  patient_data = data$count_dsc[[i]] %>% as.data.frame() %>% mutate(
    tmax = as.numeric(tmax),
    time = as.numeric(time)
  )
  # tmax is constant within a patient, so its mean recovers the single value
  patient_tmax = mean(patient_data$tmax)
  patients[i] = patient
  tmax_values[i] = patient_tmax
  n_obs[i] = nrow(patient_data)
  # transform time: time = ln(time + tmax)
  time_data = patient_data %>% mutate(time = log(time + tmax))
  plot_line = time_data %>% gam_nb_regress()
  # smooth-term plot of the model
  par(mfrow = c(1, 1))
  plot(plot_line,
       shade = FALSE, # no shaded confidence bands for the smooth
       se = FALSE,
       lwd = 1.5,
       main = paste0("Regression for patient:", patient))
  # Q-Q diagnostic of the negative-binomial fit
  qq.gam(plot_line, rep = 500)
  mtext(paste0("Q-Q plot for patient:", patient), side = 3, line = -1, outer = TRUE)
  # Fitted values on the log scale; subtracting the offset gives log(AIF).
  # Built as a tibble so time stays numeric (cbind() with the character
  # method column would coerce everything to character, breaking the x-axis
  # of the line plots below).
  fit_df = tibble(
    time = time_data$time,
    fitted = log(plot_line$fitted.values),
    offset = plot_line$offset,
    method = time_data$Method
  ) %>%
    mutate(aif = fitted - offset)
  con_df = fit_df %>% filter(method == "Continuous")
  dis_df = fit_df %>% filter(method == "Discrete")
  par(mfrow = c(1, 1))
  # base plot: continuous data fixes the axes; later layers use points()/lines()
  con_data = time_data %>% filter(Method == "Continuous")
  plot(con_data$time, log(con_data$disp_aif), col = "grey",
       xlab = "log(time)",
       ylab = "log(AIF)",
       xlim = c(-1, 5),
       ylim = c(-2, 5),
       main = paste0("Regression for patient:", patient))
  legend(3, 5, c("Continuous data"), text.col = "grey", bty = "n")
  dis_data = time_data %>% filter(Method == "Discrete")
  points(dis_data$time, log(dis_data$disp_aif), col = "red")
  legend(3, 4.5, c("Discrete data"), text.col = "red", bty = "n")
  # fitted curves: dashed = continuous, solid = discrete
  lines(con_df$time, con_df$aif, lty = 2, col = "darkgreen", lwd = 2)
  lines(dis_df$time, dis_df$aif, col = "darkgreen", lwd = 2)
  legend(3, 4, c("Predicted value"), text.col = "darkgreen", bty = "n")
}
# keep the names the summary-table code expects (nrow shadows base::nrow
# from here on, as in the original document)
tmax = tmax_values
nrow = n_obs
# Per-patient summary; as_tibble() replaces the deprecated as.tibble(), with
# explicit name repair since the bound matrix may lack usable column names
# (they are renamed right below anyway).
table = cbind(patients, tmax, nrow) %>% as_tibble(.name_repair = "unique")
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
# Give the summary presentation-friendly column names and print it.
# (Note: `table` shadows base::table from here on.)
colnames(table) = c("Patient", "Tmax", "Number_of_data")
table
## # A tibble: 10 x 3
## Patient Tmax Number_of_data
## <chr> <chr> <chr>
## 1 jdcs1 0.666666667 570
## 2 jdcs2 0.716666667 567
## 3 mhco1 0.95 553
## 4 mhco2 1.066666667 546
## 5 rwrd1 1.15 540
## 6 rwrd2 1.083333333 544
## 7 xehk1 1.35 528
## 8 xehk2 1 549
## 9 ytdh1 0.833333333 560
## 10 ytdh2 0.683333333 569
We found the mean of tmax for all patients and added back tmax for all patients’ data.
# Convert the character columns back to numeric and compute the mean tmax
# across all patients, weighted by each patient's number of observations.
table = table %>%
  mutate(
    Tmax = as.numeric(Tmax),
    Number_of_data = as.numeric(Number_of_data),
    # per-patient contribution to the weighted sum of tmax
    sum = Tmax * Number_of_data
  )
# weighted mean: sum(tmax_i * n_i) / sum(n_i)
mean_tmax = sum(table$sum) / sum(table$Number_of_data)
# Rebuild the all-patient dataset, this time shifting time back by the
# across-patient mean tmax before the log transform. bind_rows over a list
# replaces growing a data frame with rbind() in a loop, and avoids the
# deprecated data_frame().
total_data = map2(data$patient, data$count_dsc,
                  ~cbind(patient = .x, as.data.frame(.y))) %>%
  bind_rows() %>%
  mutate(
    patient = as.factor(patient),
    time = log(time + mean_tmax) # time = ln(time + mean tmax)
  )
# refit the pooled negative-binomial GAM on the transformed time scale
total_line = total_data %>% gam_total_nb_regress()
## Warning in gam.side(sm, X, tol = .Machine$double.eps^0.5): model has repeated 1-
## d smooths of same variable.
# Smooth-term plots and Q-Q diagnostic for the all-patient log-time model.
plot(total_line, main = "Regression for all patients")
qq.gam(total_line, rep = 500)
# Fixed label: the old code reused the stale per-patient loop variable
# `patient`, mislabelling this all-patient diagnostic with the last patient.
mtext("Q-Q plot for all patients", side = 3, line = -1, outer = TRUE)